import os
import json
import math
from pathlib import Path

def split_jsonl_files(source_dir, train_output="train_combined.jsonl", test_output="test_combined.jsonl", train_ratio=0.85):
    """
    Split multiple JSONL files into combined training and test sets.

    Each ``*.jsonl`` file directly inside ``source_dir`` is split on its own:
    the first ``floor(n * train_ratio)`` records go to the training file and
    the remaining records go to the test file. Input files are processed in
    sorted path order so the combined output is reproducible.

    The two output files are never treated as input, even when they live in
    ``source_dir`` and already exist from a previous run. Blank lines are
    dropped, and every record written is newline-terminated so records from
    consecutive files cannot run together.

    Args:
        source_dir (str): Directory containing JSONL files
        train_output (str): Output file for training data
        test_output (str): Output file for test data
        train_ratio (float): Proportion of data to use for training (0.0-1.0)

    Raises:
        ValueError: If ``train_ratio`` is not within [0.0, 1.0].
    """
    # Validate before opening the outputs: opening with "w" truncates them.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0.0 and 1.0, got {train_ratio!r}")

    source_path = Path(source_dir)

    # Resolve the output locations so they can be recognised among the glob
    # results regardless of how the caller spelled the paths.
    output_paths = {Path(train_output).resolve(), Path(test_output).resolve()}

    # Sorted for a deterministic order; outputs and non-files are skipped.
    jsonl_files = sorted(
        candidate
        for candidate in source_path.glob("*.jsonl")
        if candidate.is_file() and candidate.resolve() not in output_paths
    )
    print(f"Found {len(jsonl_files)} JSONL files to process")

    with open(train_output, "w", encoding="utf-8") as train_file, \
         open(test_output, "w", encoding="utf-8") as test_file:

        for i, file_path in enumerate(jsonl_files):
            print(f"Processing file {i+1}/{len(jsonl_files)}: {file_path.name}")

            # Keep only non-blank records and make sure each one ends with a
            # newline (the last line of a file may lack one).
            with open(file_path, "r", encoding="utf-8") as f:
                lines = [
                    line if line.endswith("\n") else line + "\n"
                    for line in f
                    if line.strip()
                ]

            # Leading portion goes to training, the rest to testing.
            split_index = math.floor(len(lines) * train_ratio)
            train_lines = lines[:split_index]
            test_lines = lines[split_index:]

            print(f"  - Total lines: {len(lines)}")
            print(f"  - Training lines: {len(train_lines)}")
            print(f"  - Testing lines: {len(test_lines)}")

            train_file.writelines(train_lines)
            test_file.writelines(test_lines)

    print("Processing complete!")
    print(f"Training data saved to: {train_output}")
    print(f"Test data saved to: {test_output}")

if __name__ == "__main__":
    # Script entry point: split the JSONL files found in the dataset directory.
    # The combined train/test files are written into that same directory.
    # NOTE(review): the ".../" prefix looks like a placeholder path - confirm
    # and replace it with the real location before running.
    settings = {
        "source_dir": ".../ToMbench_data",
        "train_output": ".../ToMbench_data/train_combined.jsonl",
        "test_output": ".../ToMbench_data/test_combined.jsonl",
        "train_ratio": 0.85,
    }

    split_jsonl_files(**settings)